Decide on how to process the data for various WQ parameters from discretewq
to be included in the long-term WQ publication.
# Load packages
library(tidyverse)
library(lubridate)
library(hms)
library(scales)
library(discretewq)
library(deltamapr)
library(sf)
library(leaflet)
library(dtplyr)
library(here)
# Check if we are in the correct working directory; here::i_am() errors if
# the project root does not contain this file, anchoring all here() paths
i_am("Data_processing_methods.Rmd")
## here() starts at C:/Repositories/04_IEP_Org/WQ-LT-Publication
# Run session info to display package versions (for reproducibility of this
# rendered document)
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.2.1 (2022-06-23 ucrt)
## os Windows 10 x64 (build 19044)
## system x86_64, mingw32
## ui RTerm
## language (EN)
## collate English_United States.utf8
## ctype English_United States.utf8
## tz America/Los_Angeles
## date 2022-09-06
## pandoc 2.18 @ C:/Program Files/RStudio/bin/quarto/bin/tools/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## assertthat 0.2.1 2019-03-21 [1] CRAN (R 4.2.1)
## backports 1.4.1 2021-12-13 [1] CRAN (R 4.2.0)
## broom 1.0.0 2022-07-01 [1] CRAN (R 4.2.1)
## bslib 0.4.0 2022-07-16 [1] CRAN (R 4.2.1)
## cachem 1.0.6 2021-08-19 [1] CRAN (R 4.2.1)
## callr 3.7.1 2022-07-13 [1] CRAN (R 4.2.1)
## cellranger 1.1.0 2016-07-27 [1] CRAN (R 4.2.1)
## class 7.3-20 2022-01-16 [2] CRAN (R 4.2.1)
## classInt 0.4-7 2022-06-10 [1] CRAN (R 4.2.1)
## cli 3.3.0 2022-04-25 [1] CRAN (R 4.2.1)
## colorspace 2.0-3 2022-02-21 [1] CRAN (R 4.2.1)
## crayon 1.5.1 2022-03-26 [1] CRAN (R 4.2.1)
## crosstalk 1.2.0 2021-11-04 [1] CRAN (R 4.2.1)
## data.table 1.14.2 2021-09-27 [1] CRAN (R 4.2.1)
## DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.1)
## dbplyr 2.2.1 2022-06-27 [1] CRAN (R 4.2.1)
## deltamapr * 1.0.0 2021-06-18 [1] Github (InteragencyEcologicalProgram/deltamapr@d0a6f9c)
## devtools 2.4.4 2022-07-20 [1] CRAN (R 4.2.1)
## digest 0.6.29 2021-12-01 [1] CRAN (R 4.2.1)
## discretewq * 2.3.2.9000 2022-09-06 [1] Github (sbashevkin/discretewq@c910fa0)
## dplyr * 1.0.10 2022-09-01 [1] CRAN (R 4.2.1)
## dtplyr * 1.2.1 2022-01-19 [1] CRAN (R 4.2.1)
## e1071 1.7-11 2022-06-07 [1] CRAN (R 4.2.1)
## ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.1)
## evaluate 0.15 2022-02-18 [1] CRAN (R 4.2.1)
## fansi 1.0.3 2022-03-24 [1] CRAN (R 4.2.1)
## fastmap 1.1.0 2021-01-25 [1] CRAN (R 4.2.1)
## forcats * 0.5.1 2021-01-27 [1] CRAN (R 4.2.1)
## fs 1.5.2 2021-12-08 [1] CRAN (R 4.2.1)
## gargle 1.2.0 2021-07-02 [1] CRAN (R 4.2.1)
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.1)
## ggplot2 * 3.3.6 2022-05-03 [1] CRAN (R 4.2.1)
## glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.1)
## googledrive 2.0.0 2021-07-08 [1] CRAN (R 4.2.1)
## googlesheets4 1.0.0 2021-07-21 [1] CRAN (R 4.2.1)
## gtable 0.3.1 2022-09-01 [1] CRAN (R 4.2.1)
## haven 2.5.0 2022-04-15 [1] CRAN (R 4.2.1)
## here * 1.0.1 2020-12-13 [1] CRAN (R 4.2.1)
## hms * 1.1.1 2021-09-26 [1] CRAN (R 4.2.1)
## htmltools 0.5.3 2022-07-18 [1] CRAN (R 4.2.1)
## htmlwidgets 1.5.4 2021-09-08 [1] CRAN (R 4.2.1)
## httpuv 1.6.5 2022-01-05 [1] CRAN (R 4.2.1)
## httr 1.4.3 2022-05-04 [1] CRAN (R 4.2.1)
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.2.1)
## jsonlite 1.8.0 2022-02-22 [1] CRAN (R 4.2.1)
## KernSmooth 2.23-20 2021-05-03 [2] CRAN (R 4.2.1)
## knitr 1.39 2022-04-26 [1] CRAN (R 4.2.1)
## later 1.3.0 2021-08-18 [1] CRAN (R 4.2.1)
## leaflet * 2.1.1 2022-03-23 [1] CRAN (R 4.2.1)
## lifecycle 1.0.1 2021-09-24 [1] CRAN (R 4.2.1)
## lubridate * 1.8.0 2021-10-07 [1] CRAN (R 4.2.1)
## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.1)
## memoise 2.0.1 2021-11-26 [1] CRAN (R 4.2.1)
## mime 0.12 2021-09-28 [1] CRAN (R 4.2.0)
## miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.2.1)
## modelr 0.1.8 2020-05-19 [1] CRAN (R 4.2.1)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.1)
## pillar 1.8.1 2022-08-19 [1] CRAN (R 4.2.1)
## pkgbuild 1.3.1 2021-12-20 [1] CRAN (R 4.2.1)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.1)
## pkgload 1.3.0 2022-06-27 [1] CRAN (R 4.2.1)
## prettyunits 1.1.1 2020-01-24 [1] CRAN (R 4.2.1)
## processx 3.7.0 2022-07-07 [1] CRAN (R 4.2.1)
## profvis 0.3.7 2020-11-02 [1] CRAN (R 4.2.1)
## promises 1.2.0.1 2021-02-11 [1] CRAN (R 4.2.1)
## proxy 0.4-27 2022-06-09 [1] CRAN (R 4.2.1)
## ps 1.7.1 2022-06-18 [1] CRAN (R 4.2.1)
## purrr * 0.3.4 2020-04-17 [1] CRAN (R 4.2.1)
## R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.1)
## Rcpp 1.0.9 2022-07-08 [1] CRAN (R 4.2.1)
## readr * 2.1.2 2022-01-30 [1] CRAN (R 4.2.1)
## readxl 1.4.0 2022-03-28 [1] CRAN (R 4.2.1)
## remotes 2.4.2 2021-11-30 [1] CRAN (R 4.2.1)
## reprex 2.0.1 2021-08-05 [1] CRAN (R 4.2.1)
## rlang 1.0.5 2022-08-31 [1] CRAN (R 4.2.1)
## rmarkdown 2.14 2022-04-25 [1] CRAN (R 4.2.1)
## rprojroot 2.0.3 2022-04-02 [1] CRAN (R 4.2.1)
## rstudioapi 0.13 2020-11-12 [1] CRAN (R 4.2.1)
## rvest 1.0.2 2021-10-16 [1] CRAN (R 4.2.1)
## sass 0.4.2 2022-07-16 [1] CRAN (R 4.2.1)
## scales * 1.2.1 2022-08-20 [1] CRAN (R 4.2.1)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.1)
## sf * 1.0-8 2022-07-14 [1] CRAN (R 4.2.1)
## shiny 1.7.2 2022-07-19 [1] CRAN (R 4.2.1)
## stringi 1.7.8 2022-07-11 [1] CRAN (R 4.2.1)
## stringr * 1.4.1 2022-08-20 [1] CRAN (R 4.2.1)
## tibble * 3.1.8 2022-07-22 [1] CRAN (R 4.2.1)
## tidyr * 1.2.0 2022-02-01 [1] CRAN (R 4.2.1)
## tidyselect 1.1.2 2022-02-21 [1] CRAN (R 4.2.1)
## tidyverse * 1.3.2 2022-07-18 [1] CRAN (R 4.2.1)
## tzdb 0.3.0 2022-03-28 [1] CRAN (R 4.2.1)
## units 0.8-0 2022-02-05 [1] CRAN (R 4.2.1)
## urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.2.1)
## usethis 2.1.6 2022-05-25 [1] CRAN (R 4.2.1)
## utf8 1.2.2 2021-07-24 [1] CRAN (R 4.2.1)
## vctrs 0.4.1 2022-04-13 [1] CRAN (R 4.2.1)
## withr 2.5.0 2022-03-03 [1] CRAN (R 4.2.1)
## xfun 0.31 2022-05-10 [1] CRAN (R 4.2.1)
## xml2 1.3.3 2021-11-30 [1] CRAN (R 4.2.1)
## xtable 1.8-4 2019-04-21 [1] CRAN (R 4.2.1)
## yaml 2.3.5 2022-02-21 [1] CRAN (R 4.2.1)
##
## [1] C:/R/win-library/4.2
## [2] C:/Program Files/R/R-4.2.1/library
##
## ──────────────────────────────────────────────────────────────────────────────
# Pull in the discrete water quality data from the discretewq package,
# requesting every available survey
df_dwq <-
  wq(
    Sources = c(
      "EMP", "STN", "FMWT", "EDSM", "DJFMP", "SDO", "SKT", "SLS",
      "20mm", "Suisun", "Baystudy", "USBR", "USGS_SFBS", "YBFMP",
      "USGS_CAWSC", "NCRO"
    )
  )
# Load Delta shapefile from Brian and keep only the SubRegions east of the
# Carquinez Strait (i.e. drop the Bays and the Napa River)
sf_delta <- R_EDSM_Subregions_Mahardja_FLOAT %>%
  filter(
    !SubRegion %in% c(
      "Carquinez Strait", "Lower Napa River", "San Francisco Bay",
      "San Pablo Bay", "South Bay", "Upper Napa River"
    )
  ) %>%
  select(SubRegion)
# Prepare WQ data for methods analysis: keep the columns of interest, drop
# all-NA and coordinate-less records, assign SubRegions spatially, and add
# adjusted-year/month/season variables.
df_dwq_c <- df_dwq %>%
  transmute(
    Source,
    Station,
    Latitude,
    Longitude,
    Date = date(Date),
    # Convert Datetime to PST; "Etc/GMT+8" is a fixed UTC-8 offset (POSIX
    # sign convention is reversed), so no daylight-saving shifts apply
    Datetime = with_tz(Datetime, tzone = "Etc/GMT+8"),
    Temperature,
    Salinity,
    Secchi,
    Chlorophyll,
    DissAmmonia,
    DissNitrateNitrite,
    DissOrthophos
  ) %>%
  # Remove records with NA values for all parameters (keep a row if at
  # least one parameter was measured)
  filter(
    !if_all(
      c(Temperature, Salinity, Secchi, Chlorophyll, DissAmmonia, DissNitrateNitrite, DissOrthophos),
      is.na
    )
  ) %>%
  # Remove records without lat-long coordinates (required for the spatial join)
  drop_na(Latitude, Longitude) %>%
  # Assign SubRegions to the stations via a spatial join against sf_delta;
  # remove = FALSE keeps Latitude/Longitude as plain columns
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326, remove = FALSE) %>%
  st_transform(crs = st_crs(sf_delta)) %>%
  st_join(sf_delta, join = st_intersects) %>%
  # Remove any data outside our subregions of interest (no SubRegion matched)
  filter(!is.na(SubRegion)) %>%
  st_drop_geometry() %>%
  # Add variables for adjusted calendar year, month, and season
  # Adjusted calendar year: December-November, with December of the previous calendar year
  # included with the following year (so each Winter is contiguous)
  mutate(
    Month = month(Date),
    YearAdj = if_else(Month == 12, year(Date) + 1, year(Date)),
    Season = case_when(
      Month %in% 3:5 ~ "Spring",
      Month %in% 6:8 ~ "Summer",
      Month %in% 9:11 ~ "Fall",
      Month %in% c(12, 1, 2) ~ "Winter"
    )
  ) %>%
  # Restrict data to adjusted years 1975-2021
  filter(YearAdj %in% 1975:2021)
Now that we’ve been making updates to the WQ data in the discretewq
package, let’s take a look at which surveys we can use for the
long-term WQ publication. First, we’ll look at the temporal scale of all
of the surveys available.
# Number of distinct (adjusted) years of data collected by each survey,
# most years first
df_dwq_c %>%
  group_by(Source) %>%
  summarize(NumYears = n_distinct(YearAdj)) %>%
  arrange(desc(NumYears))
## # A tibble: 16 × 2
## Source NumYears
## <chr> <int>
## 1 EMP 47
## 2 FMWT 47
## 3 STN 47
## 4 DJFMP 46
## 5 USGS_CAWSC 44
## 6 Suisun 43
## 7 Baystudy 42
## 8 USGS_SFBS 42
## 9 20mm 27
## 10 YBFMP 24
## 11 DWR_NCRO 22
## 12 SDO 22
## 13 SKT 20
## 14 SLS 13
## 15 USBR 8
## 16 EDSM 5
# Period of record (first and last sample date) for each survey, ordered by
# earliest start date
df_dwq_c %>%
  group_by(Source) %>%
  summarize(across(Date, list(min = min, max = max), .names = "{.fn}_date")) %>%
  arrange(min_date)
## # A tibble: 16 × 3
## Source min_date max_date
## <chr> <date> <date>
## 1 EMP 1975-01-07 2021-11-16
## 2 USGS_SFBS 1975-01-15 2021-11-04
## 3 STN 1975-06-30 2021-08-19
## 4 FMWT 1975-09-17 2021-11-16
## 5 DJFMP 1976-05-13 2021-11-29
## 6 USGS_CAWSC 1977-06-14 2021-11-17
## 7 Suisun 1979-05-16 2021-08-12
## 8 Baystudy 1980-02-08 2020-12-01
## 9 20mm 1995-04-24 2021-07-16
## 10 SDO 1997-08-04 2018-11-08
## 11 YBFMP 1998-01-19 2021-11-30
## 12 DWR_NCRO 2000-04-11 2021-07-14
## 13 SKT 2002-01-07 2021-04-29
## 14 SLS 2009-01-05 2021-03-17
## 15 USBR 2012-05-08 2019-10-22
## 16 EDSM 2016-12-15 2021-11-26
Overall, for all parameters, it looks like all surveys except for SLS, USBR, and EDSM have collected at least 20 years of data. We will assume that these surveys have adequate temporal coverage for the long-term analysis.
# Only include surveys with adequate temporal coverage (SLS, USBR, and EDSM
# have fewer than 20 years of data)
df_dwq_lt <- filter(df_dwq_c, !Source %in% c("SLS", "USBR", "EDSM"))
Next, we’ll prepare the individual parameters separately for further analysis.
# Prepare data separately for each parameter
prep_samp_effort <- function(df, param) {
  # Prepare sampling-effort data for one WQ parameter: drop NA values for
  # the parameter, resolve station-day duplicates within each survey by
  # keeping the sample closest to noon, and return a column-ordered tibble.
  #
  # df: data frame of discrete WQ data (one row per sample)
  # param: character scalar naming the parameter column of interest

  # Remove any NA values in parameter of interest
  df_param <- df %>% drop_na(.data[[param]])

  # Look for any instances when more than 1 data point was collected at a
  # station-day within a single survey
  df_dups <- df_param %>%
    count(Source, Station, Date) %>%
    filter(n > 1) %>%
    select(-n)

  # Fix duplicates
  df_dups_fixed <- df_param %>%
    inner_join(df_dups, by = c("Source", "Station", "Date")) %>%
    # NOTE(review): duplicated records without a Datetime are dropped
    # entirely here rather than de-duplicated - confirm this is intended
    drop_na(Datetime) %>%
    mutate(
      # Create variable for time
      Time = as_hms(Datetime),
      # Calculate difference from noon for each data point for later filtering
      Noon_diff = abs(hms(hours = 12) - Time)
    ) %>%
    # Use dtplyr to speed up operations
    lazy_dt() %>%
    # BUG FIX: include Source in the grouping so it matches how duplicates
    # were defined above (Source-Station-Date); without it, two surveys
    # sharing a station name on the same day would be collapsed to a single
    # record across surveys
    group_by(Source, Station, Date) %>%
    # Select only 1 data point per source-station-date, choosing the data
    # point closest to noon
    filter(Noon_diff == min(Noon_diff)) %>%
    # When points are equidistant from noon, select earlier point
    filter(Time == min(Time)) %>%
    ungroup() %>%
    # End dtplyr operation
    as_tibble() %>%
    select(-c(Time, Noon_diff))

  # Add back fixed duplicates and format data frame
  df_param %>%
    anti_join(df_dups, by = c("Source", "Station", "Date")) %>%
    bind_rows(df_dups_fixed) %>%
    select(
      Source,
      Station,
      Latitude,
      Longitude,
      SubRegion,
      YearAdj,
      Month,
      Season,
      Date,
      Datetime,
      .data[[param]]
    )
}
# Create a nested data frame to run functions on: one row per parameter,
# each row carrying its own copy of the long-term data set
ndf_dwq_lt <-
  tibble(
    Parameter = c(
      "Temperature",
      "Salinity",
      "Secchi",
      "DissAmmonia",
      "DissNitrateNitrite",
      "DissOrthophos",
      "Chlorophyll"
    ),
    # Derive the replication count from Parameter (tibble evaluates columns
    # sequentially) instead of hard-coding 7, so adding or removing a
    # parameter cannot silently produce a length mismatch
    df_data = rep(list(df_dwq_lt), length(Parameter))
  ) %>%
  # Prepare data for each Parameter (drop NAs, resolve station-day duplicates)
  mutate(df_data = map2(df_data, Parameter, .f = prep_samp_effort))
Next, let’s take a look at a map of all stations.
# Unique station locations as an sf point layer; coordinates are kept as
# plain columns (remove = FALSE) for labelling and later joins
sf_stations <- df_dwq_lt %>%
  distinct(Source, Station, Latitude, Longitude) %>%
  st_as_sf(coords = c("Longitude", "Latitude"), crs = 4326, remove = FALSE)

# Color palette keyed to survey
color_pal_survey <- colorFactor(palette = "viridis", domain = sf_stations$Source)

# Interactive station map: one circle marker per station, colored by survey
leaflet() %>%
  addTiles() %>%
  addCircleMarkers(
    data = sf_stations,
    label = paste0("Survey: ", sf_stations$Source, ", Station: ", sf_stations$Station),
    radius = 5,
    weight = 0.5,
    color = "black",
    opacity = 1,
    fillColor = ~color_pal_survey(Source),
    fillOpacity = 0.8
  ) %>%
  addLegend(
    position = "topright",
    pal = color_pal_survey,
    values = sf_stations$Source,
    title = "Survey:"
  )
Some of the stations from the Suisun Marsh survey are located in small backwater channels and dead-end sloughs which represent a much different habitat than the sampling locations from the other surveys which tend to be in larger, open water channel habitat. We’ll keep the stations located in Suisun, Montezuma, and Nurse Sloughs from the Suisun Marsh survey, since they seem to be in the larger channels in the area.
Also, there are a few questionable sampling locations from SKT and YBFMP, but I don’t want to dig too deep with these for now.
# For every parameter, keep all non-Suisun records plus the Suisun Marsh
# stations in Suisun (SU), Montezuma (MZ), and Nurse (NS) Sloughs
ndf_dwq_lt_filt <- ndf_dwq_lt %>%
  mutate(
    df_data = map(
      df_data,
      ~ filter(.x, Source != "Suisun" | str_detect(Station, "^SU|^MZ|^NS"))
    )
  )
Now let’s take a closer look at the temporal data coverage for each Station and parameter.
# Heatmap of sampling effort: number of samples per Station and year.
# Stations are reversed so they read top-to-bottom on the y-axis.
plot_samp_effort_sta <- function(df) {
  df_counts <- df %>%
    count(Station, YearAdj, name = "num_samples") %>%
    mutate(Station = fct_rev(factor(Station)))

  ggplot(df_counts, aes(x = YearAdj, y = Station, fill = num_samples)) +
    geom_tile() +
    scale_x_continuous(
      limits = c(1974, 2022),
      breaks = breaks_pretty(20),
      expand = expansion()
    ) +
    scale_fill_viridis_c(name = "Number of Samples") +
    theme_bw() +
    theme(
      legend.position = "top",
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )
}
# Create sampling effort by station plots for each Parameter and Source:
# nest each parameter's data by survey, then attach one plot per survey
ndf_dwq_se_sta_plt <- ndf_dwq_lt_filt %>%
  transmute(
    Parameter,
    ndf_data_source = map(df_data, ~ nest(.x, df_data2 = -Source)),
    # map() at the top level is equivalent to modify_depth(..., 1, ...)
    ndf_data_source = map(
      ndf_data_source,
      ~ mutate(.x, plt = map(df_data2, plot_samp_effort_sta))
    )
  )
DJFMP only sampled salinity for the past three years, so we won’t include this survey in the salinity analysis. For the USGS-CAWSC stations, only station 11447650 (Sacramento River at Freeport) was sampled on a long-term basis for Dissolved Ammonia, Nitrate+Nitrite, and Orthophosphate. Also, Chlorophyll wasn’t sampled consistently at any of the USGS-CAWSC stations. We’ll exclude this data from our data set before continuing.
# Apply survey-specific exclusions identified above, per parameter:
# - Salinity: drop DJFMP (only sampled salinity for ~3 years)
# - Dissolved nutrients ("Diss*"): keep only long-term USGS-CAWSC station
#   USGS-11447650 (Sacramento River at Freeport)
# - Chlorophyll: drop USGS_CAWSC entirely (not sampled consistently)
# NOTE(review): case_when evaluates every RHS over all rows before picking
# per-row results, so each filter runs on every parameter's data; this works
# on the list column df_data but is wasteful - confirm acceptable here
ndf_dwq_lt_filt <- ndf_dwq_lt_filt %>%
  mutate(
    df_data = case_when(
      Parameter == "Salinity" ~ modify_depth(df_data, 1, ~ filter(.x, Source != "DJFMP")),
      str_detect(Parameter, "^Diss") ~ modify_depth(
        df_data, 1, ~ filter(.x, !(Source == "USGS_CAWSC" & Station != "USGS-11447650"))),
      Parameter == "Chlorophyll" ~ modify_depth(df_data, 1, ~ filter(.x, Source != "USGS_CAWSC")),
      TRUE ~ df_data
    )
  )
Before we start removing subregions and stations, let’s take a look at the sampling effort for each subregion and parameter combination.
# Heatmap of sampling effort per SubRegion and year. All SubRegions from
# the global sf_delta layer appear on the y-axis even when unsampled
# (drop = FALSE), so gaps are visible.
plot_samp_effort_subreg <- function(df) {
  # Vector of all possible SubRegions, taken from sf_delta
  subreg_all <- sf_delta %>% distinct(SubRegion) %>% pull(SubRegion)

  df_counts <- df %>%
    count(SubRegion, YearAdj, name = "num_samples") %>%
    mutate(SubRegion = fct_rev(factor(SubRegion, levels = subreg_all)))

  ggplot(df_counts, aes(x = YearAdj, y = SubRegion, fill = num_samples)) +
    geom_tile() +
    scale_x_continuous(
      limits = c(1974, 2022),
      breaks = breaks_pretty(20),
      expand = expansion()
    ) +
    scale_y_discrete(drop = FALSE) +
    scale_fill_viridis_c(name = "Number of Samples") +
    theme_bw() +
    theme(
      legend.position = "top",
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )
}
# One sampling-effort-by-SubRegion plot per Parameter
ndf_dwq_se_subreg_plt <- ndf_dwq_lt_filt %>%
  transmute(Parameter, plt = map(df_data, plot_samp_effort_subreg))
Since not all of the subregions were sampled consistently from 1975-2021, we’ll only keep the Subregions that contain data for at least 75% of the 47 years between 1975 to 2021, which is 35 years. We’ll first remove the subregions that have data for less than 35 years overall before considering the temporal coverage by month.
# Keep only SubRegions with data in at least num_yrs_threshold (35) of the
# 47 years. num_yrs_threshold is defined elsewhere in the document (per the
# inline note it equals 35) - TODO confirm.
ndf_dwq_lt_filt <- ndf_dwq_lt_filt %>%
  mutate(
    # Years of data per SubRegion, keeping only those that qualify
    df_subreg = map(
      df_data,
      function(dat) {
        dat %>%
          distinct(SubRegion, YearAdj) %>%
          count(SubRegion, name = "NumYears") %>%
          filter(NumYears >= num_yrs_threshold) # num_yrs_threshold = 35
      }
    ),
    # Restrict each parameter's data to the SubRegions that passed
    df_data_filt = map2(
      df_data, df_subreg,
      function(dat, subreg) filter(dat, SubRegion %in% unique(subreg$SubRegion))
    )
  )
Let’s take a look at the sampling effort for each subregion and parameter by month for the remaining subregions.
# Heatmap of monthly sampling effort (samples per SubRegion-year-month),
# faceted by SubRegion; months reversed to read top-to-bottom
plot_samp_effort_month <- function(df) {
  df_counts <- df %>%
    mutate(Month = fct_rev(month(Date, label = TRUE))) %>%
    count(SubRegion, YearAdj, Month, name = "num_samples")

  ggplot(df_counts, aes(x = YearAdj, y = Month, fill = num_samples)) +
    geom_tile() +
    facet_wrap(vars(SubRegion)) +
    scale_x_continuous(
      limits = c(1974, 2022),
      breaks = breaks_pretty(10),
      expand = expansion()
    ) +
    scale_fill_viridis_c(name = "Number of Samples") +
    theme_bw() +
    theme(
      legend.position = "top",
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )
}
# One sampling-effort-by-Month plot per Parameter (using year-filtered data)
ndf_dwq_se_month_plt <- ndf_dwq_lt_filt %>%
  transmute(Parameter, plt = map(df_data_filt, plot_samp_effort_month))
Now, we will require that a subregion has data for at least 35 years for each month to make sure that the months were sampled adequately.
# Require each remaining SubRegion to have at least num_yrs_threshold (35)
# years of data in EVERY month; keep only the SubRegions that qualify
ndf_dwq_lt_filt <- ndf_dwq_lt_filt %>%
  mutate(
    df_subreg_month = map(
      df_data_filt,
      function(dat) {
        dat %>%
          distinct(SubRegion, YearAdj, Month) %>%
          count(SubRegion, Month, name = "NumYears") %>%
          group_by(SubRegion) %>%
          # A SubRegion passes only if its least-sampled month qualifies
          filter(min(NumYears) >= num_yrs_threshold) %>% # num_yrs_threshold = 35
          ungroup()
      }
    ),
    df_data_filt_month = map2(
      df_data_filt, df_subreg_month,
      function(dat, subreg) filter(dat, SubRegion %in% unique(subreg$SubRegion))
    )
  )
Let’s take a look at which subregions remain and how it compares to what we did in the long-term analysis for the 2021 Drought Synthesis report.
# Import Chlorophyll data used in the long-term analysis for the 2021 Drought
# Synthesis report, harmonize the old survey code with the name used here,
# and add lat-long coordinates from the station layer
df_chla_ds_lt <- read_csv(here("chla_data_stats_LT2.csv")) %>%
  mutate(Source = replace(Source, Source == "USGS-SFBRMP", "USGS_SFBS")) %>%
  left_join(st_drop_geometry(sf_stations), by = c("Source", "Station"))
# Pull out SubRegions for each parameter after using the filtering steps by month
ndf_subreg_month <- ndf_dwq_lt_filt %>%
  transmute(
    Parameter,
    df_subreg = map(df_subreg_month, ~ distinct(.x, SubRegion))
  ) %>%
  # Prepend reference rows: the full SubRegion universe plus the SubRegion
  # sets used in the 2021 Drought Synthesis long-term analyses
  add_row(
    Parameter = c(
      "AllSubregions",
      "DrtSynthWQ",
      "DrtSynthNutr",
      "DrtSynthChla"
    ),
    # NOTE: this list must stay in the same order as the Parameter vector above
    df_subreg = list(
      # Add all possible SubRegions from sf_delta
      sf_delta %>% distinct(SubRegion),
      # Add SubRegions used in the long-term analysis for the 2021 Drought Synthesis report
      DroughtData::raw_wq_1975_2021 %>% distinct(SubRegion),
      DroughtData::raw_nutr_1975_2021 %>% distinct(SubRegion),
      df_chla_ds_lt %>% distinct(SubRegion)
    ),
    .before = 1
  )
# Suffix pairs ("", "_<Parameter>") for the reducing full join; the first
# Parameter ("AllSubregions") is dropped because it seeds the reduction
lst_suffix_month <- map(
  ndf_subreg_month$Parameter[-1],
  function(p) c("", paste0("_", p))
)
# Create a custom function for the full join used by reduce2() below.
# keep = TRUE retains both SubRegion key columns so each data frame's
# membership can be told apart via the supplied suffix pair `suff`.
join_subregions <- function(df1, df2, suff) {
  full_join(df1, df2, by = "SubRegion", suffix = suff, keep = TRUE)
}
# Define parameter levels for plotting/ordering. Each Drought Synthesis
# reference set is placed immediately before the parameters it is compared
# against (WQ, nutrients, chlorophyll).
param_levels <- c(
  "DrtSynthWQ",
  "Temperature",
  "Salinity",
  "Secchi",
  "DrtSynthNutr",
  "DissAmmonia",
  "DissNitrateNitrite",
  "DissOrthophos",
  "DrtSynthChla",
  "Chlorophyll"
)
# Run reducing full join and clean up data frame for plotting.
# reduce2() successively full-joins each SubRegion set onto the first
# ("AllSubregions"), tagging each joined key column as "SubRegion_<Parameter>"
df_subreg_month <- reduce2(ndf_subreg_month$df_subreg, lst_suffix_month, join_subregions) %>%
  # Suffixed columns are non-NA only where that parameter retained the SubRegion
  mutate(across(contains("_"), ~ if_else(!is.na(.x), "Yes", "No"))) %>%
  # Strip the leading "SubRegion_" so columns are named by parameter alone
  rename_with(~ str_extract(.x, "(?<=_).+"), contains("_")) %>%
  pivot_longer(cols = -SubRegion, names_to = "Parameter", values_to = "Included") %>%
  mutate(
    Parameter = factor(Parameter, levels = param_levels),
    # Reverse the factor so SubRegions read alphabetically top-to-bottom
    SubRegion = fct_rev(factor(SubRegion))
  )
# Tile plot comparing SubRegion inclusion ("Yes"/"No") across parameters
plot_reg_comp <- function(df) {
  ggplot(df, aes(x = Parameter, y = SubRegion, fill = Included)) +
    geom_tile()
}
Filtering on a month level may be too restrictive, so let’s take a look at the sampling effort for each subregion and parameter by season.
# Heatmap of seasonal sampling effort (samples per SubRegion-year-season),
# faceted by Season in Winter -> Fall order
plot_samp_effort_seas <- function(df) {
  df_counts <- df %>%
    count(SubRegion, YearAdj, Season, name = "num_samples") %>%
    mutate(
      SubRegion = fct_rev(factor(SubRegion)),
      Season = factor(Season, levels = c("Winter", "Spring", "Summer", "Fall"))
    )

  ggplot(df_counts, aes(x = YearAdj, y = SubRegion, fill = num_samples)) +
    geom_tile() +
    facet_wrap(vars(Season)) +
    scale_x_continuous(
      limits = c(1974, 2022),
      breaks = breaks_pretty(10),
      expand = expansion()
    ) +
    scale_fill_viridis_c(name = "Number of Samples") +
    theme_bw() +
    theme(
      legend.position = "top",
      axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)
    )
}
# One sampling-effort-by-Season plot per Parameter (using year-filtered data)
ndf_dwq_se_seas_plt <- ndf_dwq_lt_filt %>%
  transmute(Parameter, plt = map(df_data_filt, plot_samp_effort_seas))
Instead of filtering on a monthly level, let’s see what happens if we require that a subregion has data for at least 35 years for each season to make sure that the seasons were sampled adequately.
# Require each remaining SubRegion to have at least num_yrs_threshold (35)
# years of data in EVERY season; keep only the SubRegions that qualify
ndf_dwq_lt_filt <- ndf_dwq_lt_filt %>%
  mutate(
    df_subreg_seas = map(
      df_data_filt,
      function(dat) {
        dat %>%
          distinct(SubRegion, YearAdj, Season) %>%
          count(SubRegion, Season, name = "NumYears") %>%
          group_by(SubRegion) %>%
          # A SubRegion passes only if its least-sampled season qualifies
          filter(min(NumYears) >= num_yrs_threshold) %>% # num_yrs_threshold = 35
          ungroup()
      }
    ),
    df_data_filt_seas = map2(
      df_data_filt, df_subreg_seas,
      function(dat, subreg) filter(dat, SubRegion %in% unique(subreg$SubRegion))
    )
  )
Again, set’s take a look at which subregions remain and how it compares to what we did in the long-term analysis for the 2021 Drought Synthesis report.
# Pull out SubRegions for each parameter after using the filtering steps by season
ndf_subreg_seas <- ndf_dwq_lt_filt %>%
  transmute(
    Parameter,
    df_subreg = map(df_subreg_seas, ~ distinct(.x, SubRegion))
  ) %>%
  # Prepend reference rows: the full SubRegion universe plus the SubRegion
  # sets used in the 2021 Drought Synthesis long-term analyses
  add_row(
    Parameter = c(
      "AllSubregions",
      "DrtSynthWQ",
      "DrtSynthNutr",
      "DrtSynthChla"
    ),
    # NOTE: this list must stay in the same order as the Parameter vector above
    df_subreg = list(
      # Add all possible SubRegions from sf_delta
      sf_delta %>% distinct(SubRegion),
      # Add SubRegions used in the long-term analysis for the 2021 Drought Synthesis report
      DroughtData::raw_wq_1975_2021 %>% distinct(SubRegion),
      DroughtData::raw_nutr_1975_2021 %>% distinct(SubRegion),
      df_chla_ds_lt %>% distinct(SubRegion)
    ),
    .before = 1
  )
# Suffix pairs ("", "_<Parameter>") for the reducing full join; the first
# Parameter ("AllSubregions") is dropped because it seeds the reduction
lst_suffix_seas <- map(
  ndf_subreg_seas$Parameter[-1],
  function(p) c("", paste0("_", p))
)
# Run reducing full join and clean up data frame for plotting (same approach
# as the monthly version: successive full joins tag each joined key column
# as "SubRegion_<Parameter>")
df_subreg_seas <- reduce2(ndf_subreg_seas$df_subreg, lst_suffix_seas, join_subregions) %>%
  # Suffixed columns are non-NA only where that parameter retained the SubRegion
  mutate(across(contains("_"), ~ if_else(!is.na(.x), "Yes", "No"))) %>%
  # Strip the leading "SubRegion_" so columns are named by parameter alone
  rename_with(~ str_extract(.x, "(?<=_).+"), contains("_")) %>%
  pivot_longer(cols = -SubRegion, names_to = "Parameter", values_to = "Included") %>%
  mutate(
    Parameter = factor(Parameter, levels = param_levels),
    # Reverse the factor so SubRegions read alphabetically top-to-bottom
    SubRegion = fct_rev(factor(SubRegion))
  )
Filtering the subregions by including only those that have data for
at least 35 years for each season seems to be slightly
more permissive than filtering on the monthly level for DissAmmonia and
Chlorophyll. Comparing the seasonal approach to the core station
approach as seen in discretewq
Sampling Effort document, the water quality parameters lose a few
subregions overall, and the nutrient and chlorophyll parameters gain a
few subregions.
I am leaning towards using the filtering subregions by the seasonal level approach. If we decide on this as our spatial and temporal filtering method, are all regions covered adequately for the analyses? Let’s see…
Let’s look at the regional coverage assuming that we’ll use the seasonal level approach to filter our data, and compare it to what we used in the long-term analysis for the 2021 Drought Synthesis report.
# Bring in Region-Subregion crosswalk from the Drought Synthesis and add Grant
# Line Canal and Old River subregion to it
# NOTE(review): `:::` accesses an unexported object from DroughtData, which
# may break if the package changes its internals - confirm no exported
# alternative exists
df_regions_mod <- DroughtData:::df_regions %>%
  distinct(SubRegion, Region) %>%
  add_row(SubRegion = "Grant Line Canal and Old River", Region = "SouthCentral")
# Pull out Regions for each parameter after using the filtering steps by season,
# and count number of subregions in each region
ndf_regions <- ndf_dwq_lt_filt %>%
  transmute(
    Parameter,
    df_region = map(df_data_filt_seas, ~ distinct(.x, SubRegion))
  ) %>%
  # Prepend the SubRegion sets used in the 2021 Drought Synthesis analyses
  add_row(
    Parameter = c(
      "DrtSynthWQ",
      "DrtSynthNutr",
      "DrtSynthChla"
    ),
    # NOTE: this list must stay in the same order as the Parameter vector above
    df_region = list(
      # Add SubRegions used in the long-term analysis for the 2021 Drought Synthesis report
      DroughtData::raw_wq_1975_2021 %>% distinct(SubRegion),
      DroughtData::raw_nutr_1975_2021 %>% distinct(SubRegion),
      df_chla_ds_lt %>% distinct(SubRegion)
    ),
    .before = 1
  ) %>%
  mutate(
    # Attach each SubRegion's Region and count SubRegions per Region
    df_region = map(
      df_region,
      ~ left_join(.x, df_regions_mod, by = "SubRegion") %>%
        count(Region)
    ),
    Parameter = factor(Parameter, levels = param_levels)
  ) %>%
  # Order rows by factor level, then revert to character for printing below
  arrange(Parameter) %>%
  mutate(Parameter = as.character(Parameter))
# Print out region counts for each parameter.
# seq_len() is safe when the data frame has zero rows, whereas 1:nrow()
# would iterate over c(1, 0)
for (i in seq_len(nrow(ndf_regions))) {
  print(ndf_regions$Parameter[i])
  print(ndf_regions$df_region[[i]])
}
## [1] "DrtSynthWQ"
## # A tibble: 5 × 2
## Region n
## <chr> <int>
## 1 Confluence 6
## 2 North 1
## 3 SouthCentral 10
## 4 Suisun Bay 4
## 5 Suisun Marsh 1
## [1] "Temperature"
## # A tibble: 5 × 2
## Region n
## <chr> <int>
## 1 Confluence 6
## 2 North 1
## 3 SouthCentral 6
## 4 Suisun Bay 4
## 5 Suisun Marsh 1
## [1] "Salinity"
## # A tibble: 5 × 2
## Region n
## <chr> <int>
## 1 Confluence 6
## 2 North 1
## 3 SouthCentral 5
## 4 Suisun Bay 4
## 5 Suisun Marsh 1
## [1] "Secchi"
## # A tibble: 5 × 2
## Region n
## <chr> <int>
## 1 Confluence 6
## 2 North 1
## 3 SouthCentral 6
## 4 Suisun Bay 4
## 5 Suisun Marsh 1
## [1] "DrtSynthNutr"
## # A tibble: 4 × 2
## Region n
## <chr> <int>
## 1 Confluence 6
## 2 North 1
## 3 SouthCentral 8
## 4 Suisun Bay 4
## [1] "DissAmmonia"
## # A tibble: 4 × 2
## Region n
## <chr> <int>
## 1 Confluence 4
## 2 North 1
## 3 SouthCentral 4
## 4 Suisun Bay 3
## [1] "DissNitrateNitrite"
## # A tibble: 4 × 2
## Region n
## <chr> <int>
## 1 Confluence 4
## 2 North 1
## 3 SouthCentral 4
## 4 Suisun Bay 3
## [1] "DissOrthophos"
## # A tibble: 4 × 2
## Region n
## <chr> <int>
## 1 Confluence 4
## 2 North 1
## 3 SouthCentral 4
## 4 Suisun Bay 3
## [1] "DrtSynthChla"
## # A tibble: 4 × 2
## Region n
## <chr> <int>
## 1 Confluence 6
## 2 North 1
## 3 SouthCentral 9
## 4 Suisun Bay 4
## [1] "Chlorophyll"
## # A tibble: 4 × 2
## Region n
## <chr> <int>
## 1 Confluence 6
## 2 North 1
## 3 SouthCentral 6
## 4 Suisun Bay 4
It looks like all of the same regions are covered as in the long-term analysis for the 2021 Drought Synthesis report, with fewer subregions in some of the regions now.